################################################################################ 
#
# The distribution of hate speech and its implications for content moderation
# PSRM - Replication package
# Table F16
#
################################################################################ 



################################################################################ 
#  LIBRARIES
################################################################################ 

library(dplyr)       
library(readr)       
library(estimatr)    
library(glmnet)  
library(hdm)    
library(gtools)
library(retrodesign)

rm(list = setdiff(ls(), ls(pattern = "^wd|^setsave$")))

set.seed(42)


################################################################################ 
#   DATA AND FOLDER
################################################################################ 

# read
data = read.csv(paste0(wd_data_processed, '/for_analysis/clean_for_analysis.csv'))
lassodata = read_rds(paste0(wd_data_processed, '/for_analysis/data_lasso.RDS'))

# drop attrited users
data = data[which(data$attrition==0 & data$not_auth_sofar==0 & data$suspended_sofar==0 & data$protected_sofar==0),]

################################################################################ 
#   Prepare 
################################################################################ 

outvars = c('anchor_deleted_12h', 'post_tweets_h_avg')
outvars_name = c( 'Hate Tweet Deleted <12h', 'Probability of Hate Speech') 

treatments = c('alert', 'consequences', 'empathy')
treatments_lab = c( 'Alerting of Hate Speech', 'Warning of Consequences', 'Empathy')


# Scale data
data[outvars] = lapply(data[outvars], scale)
lassodata = as.data.frame(lapply(lassodata[,2:ncol(lassodata)], scale))

################################################################################ 
# Main effects MODELS
################################################################################ 

MODELS = list()

# R-squared vector
rsq <- rep(NA, length(outvars)*length(treatments))

n = 1

for (j in 1:length(outvars)){
  for (i in 1:length(treatments)){
    
    droper = union(which(is.na(data[outvars[j]])), which(data[outvars[j]]==Inf))
    keeper = union(which(data$treat_aggregate == treatments[i]),  which(data$treat_aggregate == 'control')) 
    keeper = setdiff(keeper, droper)
    
    y = data[keeper, outvars[j]]
    d = data$treated[keeper]
    x = lassodata[keeper, ] %>% as.matrix()
    
    print('Data ready')
    
    c = rep(FALSE, ncol(x))
    c[c(2,3)] = TRUE
    
    MODELS[[n]] = rlassoEffect(x, y, d, method = "double selection", 
                               I3 = c)
    
    rsq[n] <- summary(lm(y~x[,MODELS[[n]]$selection.index]))$r.squared
    
    
    print('Done with: ')
    print(n)
    
    n = n+1
    
  }}

# Getting extra info
make.container = function(MOD){
  coef = sapply(MOD, function(m) summary(m)[[1]][1])
  se = sapply(MOD, function(m) summary(m)[[1]][2])
  pval = sapply(MOD, function(m) summary(m)[[1]][4])
  regnum =  sapply(MOD, function(m) length(m$coefficients.reg)-2)
  sample = sapply(MOD, function(m) m$samplesize)
  controls = sapply(MOD, function(m) paste0(names(m$coefficients.reg)[-c(1, 2)], collapse = ', '))
  
  container = cbind.data.frame(yvar = rep(outvars, each=length(treatments)),
                               yvar_name = rep(outvars_name, each=length(treatments)),
                               treat = rep(treatments, time=length(outvars)),
                               treatments_lab = rep(treatments_lab, time=length(outvars)),
                               coef = coef, se=se, pval=pval,
                               samplesize = sample,
                               lasso_cont_num = regnum,
                               controls = controls )
  return(container)
}

container_all = make.container(MODELS)

container_all$rsq <- rsq
container_all$abs_effect <- abs(container_all$coef)
container_all$subset = "Full Sample"

# Function for equivalence testing from https://github.com/ekhartman/rdd_equivalence/tree/master
# This function is originally for RDD equivalence testing, but the reason I use it here is that it uses 
# the estimate and standard error *from the model* to calculate the equivalence confidence interval,
# rather than raw data like the equivtest package.
# The `inverted` value (that we pull out here) corresponds to the `CI_sub` value in the equiv.t.test
# function in the equivtest package; however, the equiv.t.test function requires raw treatment group
# vectors as inputs and so it doesn't account for the covariate adjustment.
# Using this function lets us use the t-distributed effect estimates from the covariate-adjusted model.
test.equiv <- function(est, se, eps, alpha = 0.05, inv_int_search_tol = 0.001, max_search_grid = 3) {
  if(eps < 0) { 
    stop("Epsilon must be > 0 for equivalence t-test.")
  }
  
  p = pchisq(est^2/se^2, 1, eps^2/se^2)
  rej = p < alpha
  
  inverted <- tryCatch(uniroot(function(x) {
    pchisq(est^2/se^2, 1, x^2/se^2) - alpha}, 
    c(0.00001, max(10, max_search_grid * abs(est) * se)), tol = inv_int_search_tol)$root, 
    silent = TRUE, error = function(e) NA)
  
  ## if noncentrality parameter estimate is so close to zero that
  ## p < alpha with at ncp = 0, return NA
  if(pchisq(est^2/se^2, 1) < alpha) {
    inverted = NA
  } else {
    inverted <- tryCatch(uniroot(function(x) {
      pchisq(est^2/se^2, 1, x^2/se^2) - alpha}, 
      c(0.00001, max(10, max_search_grid * abs(est) * se)), tol = inv_int_search_tol)$root, 
      silent = TRUE, error = function(e) NA)
    
    if(is.na(inverted)) warning("No interval found, try increasing max_search_grid.")
  }
  
  return(list(rej = rej, p = p, inverted = inverted))
}


# Power analysis
for(i in 1:nrow(container_all)){
  container_all$power[i] <- power.t.test(delta = container_all$coef[i], n = container_all$samplesize[i], sd = sqrt(1 - container_all$rsq[i]))$power
  container_all$mde[i] <- power.t.test(power = 0.8, n = container_all$samplesize[i], sd = sqrt(1 - container_all$rsq[i]))$delta
  container_all$nec_samplesize[i] <- power.t.test(power = 0.8, delta = container_all$coef[i], sd = sqrt(1 - container_all$rsq[i]))$n
  container_all$gelman_power[i] <- retro_design_closed_form(container_all$coef[i], container_all$se[i])$power
  container_all$type_s[i] <- retro_design_closed_form(container_all$coef[i], container_all$se[i])$type_s
  container_all$type_m[i] <- retro_design_closed_form(container_all$coef[i], container_all$se[i])$type_m
  container_all$power_min[i] <- power.t.test(delta = 0.1, n = container_all$samplesize[i], sd = sqrt(1 - container_all$rsq[i]))$power
  container_all$power_max[i] <- power.t.test(delta = 0.2, n = container_all$samplesize[i], sd = sqrt(1 - container_all$rsq[i]))$power
  container_all$equiv_bound[i] <- test.equiv(est = container_all$coef[i], se = container_all$se[i], eps = 0.5)$inverted
  
}



################################################################################
# create heterogeneity
################################################################################

data$hate_pre = data$no_hate_tweets_pre   # both variables refer to the 30 days before intervention
data$hate_pre[data$no_tweets_pre==0] = 0  # 3 users did not post anything

################################################################################ 
#   Run the heterogeneity models
################################################################################ 


regresssion = function(outvars, treatments, group_ind){
  
  n = 1
  MODELS_controls = list()
  MODELS_r2 = list()
  
  data_temp = data[which(data$group %in% group_ind),]
  lassodata_temp = lassodata[which(data$group %in% group_ind),]
  
  print(paste0("subset length is: ", nrow(data_temp)))
  
  for (j in 1:length(outvars)){
    for (i in 1:length(treatments)){
      
      droper = union(which(is.na(data_temp[outvars[j]])), which(data_temp[outvars[j]]==Inf))
      keeper = union(which(data_temp$treat_aggregate == treatments[i]),  which(data_temp$treat_aggregate == 'control')) 
      keeper = setdiff(keeper, droper)
      
      y = data_temp[keeper, outvars[j]]
      d = data_temp$treated[keeper]
      x = lassodata_temp[keeper, ] %>% as.matrix()
      
      print('Data ready')
      
      c = rep(FALSE, ncol(x))
      c[c(2,3)] = TRUE
      
      MODELS_controls[[n]] = rlassoEffect(x, y, d, method = "double selection", 
                                          I3 = c)
      
      MODELS_r2[[n]] = summary(lm(y~x[,MODELS_controls[[n]]$selection.index]))$r.squared
      
      print('Done with: ')
      print(n)
      
      n = n+1
    }}
  return(list(MODELS_controls, MODELS_r2))
}

################################################################################ 
#   Run everything over median
################################################################################ 

groups = c(1, 2)
data$group = quantcut(data$hate_pre, q = 2, na.rm = TRUE, labels=groups)

MODELS1 = regresssion(outvars, treatments, c(1))
MODELS2 = regresssion(outvars, treatments, c(2))


make_lasso_container = function(list_to_be_transformed, outvars, outvars_name){
  MOD = list_to_be_transformed
  coef = sapply(MOD, function(m) summary(m)[[1]][1])
  se = sapply(MOD, function(m) summary(m)[[1]][2])
  pval = sapply(MOD, function(m) summary(m)[[1]][4])
  regnum =  sapply(MOD, function(m) length(m$coefficients.reg)-2)
  sample = sapply(MOD, function(m) m$samplesize)
  controls = sapply(MOD, function(m) paste0(names(m$coefficients.reg)[-c(1, 2)], collapse = ', '))
  container = cbind.data.frame(yvar = rep(outvars, each=length(treatments)),
                               yvar_name = rep(outvars_name, each=length(treatments)),
                               # categories = rep(categories, each=3),
                               treat = rep(treatments, time=length(outvars)),
                               treatments_lab = rep(treatments_lab, time=length(outvars)),
                               coef = coef, se=se, pval=pval,
                               samplesize = sample,
                               lasso_cont_num = regnum,
                               controls = controls )
  return(container)}

# Get data 1
container1 = make_lasso_container(MODELS1[[1]], outvars, outvars_name)
container1$subset = "Pre-treatment hate speech below median"
container1$abs_effect <- abs(container1$coef)

# Power analysis
for(i in 1:nrow(container1)){
  container1$rsq[i] <- MODELS1[[2]][[i]]
  container1$power[i] <- power.t.test(delta = container1$coef[i], n = container1$samplesize[i], sd = sqrt(1 - container1$rsq[i]))$power
  container1$mde[i] <- power.t.test(power = 0.8, n = container1$samplesize[i], sd = sqrt(1 - container1$rsq[i]))$delta
  container1$nec_samplesize[i] <- power.t.test(power = 0.8, delta = container1$coef[i], sd = sqrt(1 - container1$rsq[i]))$n
  container1$gelman_power[i] <- retro_design_closed_form(container1$coef[i], container1$se[i])$power
  container1$type_s[i] <- retro_design_closed_form(container1$coef[i], container1$se[i])$type_s
  container1$type_m[i] <- retro_design_closed_form(container1$coef[i], container1$se[i])$type_m
  container1$power_min[i] <- power.t.test(delta = 0.1, n = container1$samplesize[i], sd = sqrt(1 - container1$rsq[i]))$power
  container1$power_max[i] <- power.t.test(delta = 0.2, n = container1$samplesize[i], sd = sqrt(1 - container1$rsq[i]))$power
  container1$equiv_bound[i] <- test.equiv(est = container1$coef[i], se = container1$se[i], eps = 0.5)$inverted
}

# Get data 2
container2 = make_lasso_container(MODELS2[[1]], outvars, outvars_name)
container2$subset = "Pre-treatment hate speech above median"
container2$abs_effect <- abs(container2$coef)

# Power analysis
for(i in 1:nrow(container2)){
  container2$rsq[i] <- MODELS2[[2]][[i]]
  container2$power[i] <- power.t.test(delta = container2$coef[i], n = container2$samplesize[i], sd = sqrt(1 - container2$rsq[i]))$power
  container2$mde[i] <- power.t.test(power = 0.8, n = container2$samplesize[i], sd = sqrt(1 - container2$rsq[i]))$delta
  container2$nec_samplesize[i] <- power.t.test(power = 0.8, delta = container2$coef[i], sd = sqrt(1 - container2$rsq[i]))$n
  container2$gelman_power[i] <- retro_design_closed_form(container2$coef[i], container2$se[i])$power
  container2$type_s[i] <- retro_design_closed_form(container2$coef[i], container2$se[i])$type_s
  container2$type_m[i] <- retro_design_closed_form(container2$coef[i], container2$se[i])$type_m
  container2$power_min[i] <- power.t.test(delta = 0.1, n = container2$samplesize[i], sd = sqrt(1 - container2$rsq[i]))$power
  container2$power_max[i] <- power.t.test(delta = 0.2, n = container2$samplesize[i], sd = sqrt(1 - container2$rsq[i]))$power
  container2$equiv_bound[i] <- test.equiv(est = container2$coef[i], se = container2$se[i], eps = 0.5)$inverted
}

# Bind
container = rbind(container_all, container1,container2) %>%
  mutate(coef_mde = ifelse(abs_effect>mde, 1, 0))


power_table <- container %>% 
  mutate(subset2 = case_when(
    subset == "Pre-treatment hate speech below median" ~ "Hate speech below median",
    subset == "Pre-treatment hate speech above median" ~ "Hate speech above median",
    subset == "Full Sample" ~ "Full sample"),
    treat2 = case_when(
      treatments_lab == "Alerting of Hate Speech" ~ "Alert",
      treatments_lab == "Warning of Consequences" ~ "Warning",
      treatments_lab == "Empathy" ~ "Empathy")) %>%
  mutate(treat2 = factor(treat2, levels = c("Alert", "Empathy", "Warning"))) %>%
  select(Sample = subset2, Treatment = treat2, Outcome = yvar_name, 
         `Estimated effect` = coef, `Minimal detectable effect` = mde, `Post-hoc power` = power, 
         `Power based on effect 0.1` = power_min, `Power based on effect 0.2` = power_max,
         `Equivalence bound` = equiv_bound)
xtable::xtable(power_table)

# Generate LaTeX table
latex_table <- xtable::xtable(power_table, caption = "Power analyses")

# Print the LaTeX code
print(latex_table, include.rownames = FALSE, floating = TRUE, 
      booktabs = TRUE, caption.placement = "top")

## Table for paper without post-hoc power:
power_table_paper <- power_table %>% select(-`Post-hoc power`)

# Generate LaTeX table
latex_table_paper <- xtable::xtable(power_table_paper, caption = "Power analyses")

# Print the LaTeX code
print(latex_table_paper, include.rownames = FALSE, floating = TRUE, 
      booktabs = TRUE, caption.placement = "top",
      file = paste0(wd_res, "/tables/tabF16.tex"),)

cat("\n====================\n")
cat("Saved Table F16")
cat("\n====================\n")




